##Einlesen der Tabellen und umwandeln der Tabellen in data frames

library(readxl)
## Warning: Paket 'readxl' wurde unter R Version 4.3.1 erstellt
library(dplyr)
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
Endung <- "_dengue_extracted.xlsx"
Anfang <- "Dengue"

for (i in 2006:2020) {
  
  Dateiname <- paste0(i, Endung) #erstellen das Dateinamens
  Name_df <- paste0(Anfang, i) 
  assign(Name_df, read_excel(Dateiname)) #erstellen der Variable und zuweisen der Excel-Tabelle
  
  df_dengue <- get(Name_df)
  vec_data_names <- vector("character")
  
  vec_data_names <- sapply(df_dengue$Reporting_areas, function(g) {
    paste(g[1], i)
  })
  data_names <- matrix(vec_data_names,nrow = 77, ncol = 1)
  df_dengue <- cbind(data_names, df_dengue)
  assign(Name_df,df_dengue)
}
total_years = data.frame()
for (i in 2006:2020){
  a = get(paste0("Dengue",i))
  total_years = rbind(total_years, a)
}

#Rows als Zeitachse mit Monaten

m = 1
data_total = data.frame(col1 = character(0), col2 = character(0), col3 = numeric(0), month = character(0))
colnames(data_total) = c("data_names", "Reporting_areas", "dengue_cases", "month")
sorted_total_years = total_years[order(total_years$"Reporting_areas"),]
for (k in 1:1155){
  for (j in 3:14){
    square <- sorted_total_years[k, c(1, 2, j)]
    month = colnames(square[3])
    square <- cbind(square,month)
    colnames(square) = c("data_names", "Reporting_areas", "dengue_cases", "month")
    data_total <- rbind(data_total, square)
    colnames(data_total) = c("data_names", "Reporting_areas", "dengue_cases", "month")
    data_total[m,1] = paste(data_total[m,1], month[1])
    m = m + 1
  }
}

#adding temperature, longitude and latitude column

m = 1
temp_data = read_excel("era5_data_2006_2020_thailand_monmean.xlsx")
temp_data = as.data.frame(temp_data)
temp_data_sorted = data.frame( col1 = numeric(0), Longitude = numeric(0), Latitude = numeric(0))
for (i in 1:77) {
  for (j in 5:184) {
    temp_data_sorted[m,1] = temp_data[i,j]
    temp_data_sorted[m,2] = temp_data[i,3]
    temp_data_sorted[m,3] = temp_data[i,4]
    m = m + 1
  }
}
data_total = cbind(data_total, temp_data_sorted)
colnames(data_total)[5] = "temperature"

#Einlesen Population data

library(readxl)

Endung <- "_population.xlsx"
#empty dataframe for the data
total_population = data.frame(Reporting_areas = character(), population_count = numeric())

# stack years of 2006-2011 below each other
for (i in 2006:2011) {
  population_df  = data.frame()
  Dateiname <- paste0(i, Endung) #erstellen das Dateinamens
  population_df = read_excel(Dateiname) #erstellen der Variable und zuweisen der Excel-Tabelle
  vec_years = rep(i, times = 77)
  population_df = cbind(population_df,vec_years)
  names(population_df) = c("Reporting_areas", "population_count", "year")
  # multiply every row 12 times for each month
  repeat_count <- 12
  population_df_replicated <- population_df[rep(row.names(population_df), each = repeat_count), ]
  row.names(population_df_replicated) <- NULL
  
  #put all the years together
  total_population = rbind(total_population, population_df_replicated)
  
}

# stack years of 2012-2020 below each other and add to data_total
population_df = read_excel("2012-2020_population.xlsx")
colnames(population_df) = c("Reporting_areas", "population_count", "population_count", "population_count", "population_count", "population_count","population_count","population_count","population_count","population_count")
pop_placeholder = data.frame(Reporting_areas = character(), population_count = numeric(), year = numeric())
col_num = 2
for (i in 2012:2020) {
  year = rep(i, times = 77)
  pop_placeholder = rbind(pop_placeholder, cbind(population_df[,c(1,col_num)],year))
  names(pop_placeholder) = c("Reporting_areas", "population_count", "year")
  col_num = col_num + 1
}

# multiply every row 12 times for each month
repeat_count <- 12
population_df_replicated <- pop_placeholder[rep(row.names(pop_placeholder), each = repeat_count), ]
row.names(population_df_replicated) <- NULL

# combine all population data in each 
total_population = rbind(total_population, population_df_replicated)
sorted_total_population = total_population[order(total_population$"Reporting_areas"),]

data_total = cbind(data_total,sorted_total_population[,c(2,3)])
row.names(data_total) = NULL

adding time column

first_date = as.Date("2006-01-01")
last_date = as.Date("2020-12-31")

time_column_short = seq(first_date, last_date, by = "month")
time_column_short = as.Date(time_column_short)
time_column = rep(time_column_short, times = 77)

data_total = cbind(data_total, time_column)
#data_total = subset(data_total, select = -time_column)

#Detect missing values

detect_na <- function(x) {
  is_na <- x %in% c("N/A", "na", "NA", "n/a")
  replace(x, is_na, NA)
}
data_total <- apply(data_total, 2, detect_na)
rmv.rows = apply(data_total,1,function(x){sum(is.na(x))})
i.missing = which(rmv.rows >0)

#Inzindenz

data_total = as.data.frame(data_total)
data_total$dengue_cases = as.integer(data_total$dengue_cases)
## Warning: NAs durch Umwandlung erzeugt
data_total$population_count = as.integer(data_total$population_count)
dimensions = dim(data_total)
R= dimensions[1]

for(Zeile in 1:R){
  Population <- data_total$population_count[Zeile]
  Infection <- data_total$dengue_cases[Zeile]
  Incidence = (Infection/Population)*100000
  data_total$incidence[Zeile] <- Incidence
}

#create clean data frame

detect_na = function(x) {
  is_na = x %in% c("N/A", "na", "NA", "n/a")
  replace(x, is_na, NA)
}
data_total = apply(data_total, 2, detect_na)
rmv.rows = apply(data_total,1,function(x){sum(is.na(x))})
i.missing = which(rmv.rows >0)

clean_data_total = as.data.frame(data_total[-i.missing,])
num_col = c("temperature", "Longitude", "Latitude", "incidence")
int_col = c("dengue_cases","population_count","year")
date_col = c("time_column")
clean_data_total[int_col] = lapply(clean_data_total[int_col], as.integer)
clean_data_total[num_col] = lapply(clean_data_total[num_col], as.numeric)
clean_data_total[date_col] = lapply(clean_data_total[date_col], as.Date)
summary(clean_data_total)
##   data_names        Reporting_areas     dengue_cases        month          
##  Length:13727       Length:13727       Min.   :   0.00   Length:13727      
##  Class :character   Class :character   1st Qu.:  16.00   Class :character  
##  Mode  :character   Mode  :character   Median :  43.00   Mode  :character  
##                                        Mean   :  98.19                     
##                                        3rd Qu.: 103.00                     
##                                        Max.   :8324.00                     
##   temperature      Longitude         Latitude      population_count 
##  Min.   :17.67   Min.   : 98.03   Min.   : 6.178   Min.   : 175121  
##  1st Qu.:26.13   1st Qu.: 99.79   1st Qu.:13.195   1st Qu.: 467476  
##  Median :27.23   Median :100.55   Median :14.857   Median : 639063  
##  Mean   :27.05   Mean   :101.01   Mean   :14.203   Mean   : 841759  
##  3rd Qu.:28.35   3rd Qu.:102.13   3rd Qu.:16.628   3rd Qu.:1062699  
##  Max.   :34.43   Max.   :105.11   Max.   :19.848   Max.   :5716248  
##       year       time_column           incidence      
##  Min.   :2006   Min.   :2006-01-01   Min.   :  0.000  
##  1st Qu.:2009   1st Qu.:2009-11-01   1st Qu.:  2.753  
##  Median :2013   Median :2013-07-01   Median :  6.652  
##  Mean   :2013   Mean   :2013-07-05   Mean   : 11.274  
##  3rd Qu.:2017   3rd Qu.:2017-04-01   3rd Qu.: 13.947  
##  Max.   :2020   Max.   :2020-12-01   Max.   :370.477

#struture data frame and convert into right datatypes

data_total = as.data.frame(data_total)
num_col = c("temperature", "Longitude", "Latitude", "incidence")
int_col = c("dengue_cases","population_count","year")
date_col = c("time_column")
data_total[int_col] = lapply(data_total[int_col], as.integer)
data_total[num_col] = lapply(data_total[num_col], as.numeric)
data_total[date_col] = lapply(data_total[date_col], as.Date)

data_total = subset(data_total, select = c("data_names", "Reporting_areas", "year", "month", "time_column", "dengue_cases", "population_count", "temperature", "Longitude", "Latitude", "incidence"))

save(data_total, file = "data_total.RData")
summary(data_total)
##   data_names        Reporting_areas         year         month          
##  Length:13860       Length:13860       Min.   :2006   Length:13860      
##  Class :character   Class :character   1st Qu.:2009   Class :character  
##  Mode  :character   Mode  :character   Median :2013   Mode  :character  
##                                        Mean   :2013                     
##                                        3rd Qu.:2017                     
##                                        Max.   :2020                     
##                                                                         
##   time_column          dengue_cases     population_count   temperature   
##  Min.   :2006-01-01   Min.   :   0.00   Min.   : 175121   Min.   :17.67  
##  1st Qu.:2009-09-23   1st Qu.:  16.00   1st Qu.: 467411   1st Qu.:26.10  
##  Median :2013-06-16   Median :  43.00   Median : 638660   Median :27.22  
##  Mean   :2013-06-16   Mean   :  98.19   Mean   : 840182   Mean   :27.03  
##  3rd Qu.:2017-03-08   3rd Qu.: 103.00   3rd Qu.:1060272   3rd Qu.:28.33  
##  Max.   :2020-12-01   Max.   :8324.00   Max.   :5716248   Max.   :34.43  
##                       NA's   :133       NA's   :60                       
##    Longitude         Latitude        incidence      
##  Min.   : 98.03   Min.   : 6.178   Min.   :  0.000  
##  1st Qu.: 99.79   1st Qu.:13.195   1st Qu.:  2.753  
##  Median :100.55   Median :14.857   Median :  6.652  
##  Mean   :101.03   Mean   :14.231   Mean   : 11.274  
##  3rd Qu.:102.13   3rd Qu.:16.628   3rd Qu.: 13.947  
##  Max.   :105.11   Max.   :19.848   Max.   :370.477  
##                                    NA's   :133

adding the data from Bunkan-district to Nong Khai and deleting all Bunkan-Data

for (i in 1:dim(data_total)[1]) {
  
  if(data_total$Reporting_areas[i]=="Bungkan"){
    
    Datum = data_total$time_column[i]
    for(j in 1:dim(data_total)[1]){
      if(data_total$Reporting_areas[j] == "Nong Khai" & data_total$time_column[j] == Datum){
        
        data_total$dengue_cases[j]= ifelse(is.na(data_total$dengue_cases[i]), data_total$dengue_cases[j], data_total$dengue_cases[i]+data_total$dengue_cases[j])
        data_total$population_count[j]= ifelse(is.na(data_total$population_count[i]), data_total$population_count[j], data_total$population_count[i]+data_total$population_count[j])
        data_total$incidence[j]= ifelse(is.na(data_total$incidence[i]), data_total$incidence[j], data_total$incidence[i]+data_total$incidence[j])
        
      }
    }
  }
}

#set values of Bungkan equal to Nong Khai for mapping
map_data_total = data_total
for (k in 1:dim(data_total)[1]){
  Datum = data_total$time_column[k]
  if (data_total$Reporting_areas[k]=="Bungkan") {
    for(p in 1:dim(data_total)[1]){
      if(data_total$Reporting_areas[p] == "Nong Khai" & data_total$time_column[p] == Datum){
        map_data_total[k,c("temperature", "incidence", "dengue_cases")] = data_total[p,c("temperature", "incidence", "dengue_cases")]
      }
    }
  }
    
}
  

data_total = data_total[data_total$Reporting_areas != "Bungkan", ]
row.names(data_total) = NULL

#create data frame for monsoon season

vec_month_monsoon = c("Jun", "Jul", "Aug", "Sep")
data_monsoon = data_total[data_total$month %in% vec_month_monsoon,]

rmv.rows.m = apply(data_monsoon,1,function(x){sum(is.na(x))})
i.missing.m = which(rmv.rows.m >0)
clean_data_monsoon = as.data.frame(data_monsoon[-i.missing.m,])

creating a dataframe for the total dengue cases in thailand over the months

month_vec = c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec" ) 
total_thailand = data.frame(data_names = character(), dengue_cases = numeric())

for (i in 2006:2020) {
  name_years = paste0("rows_", i)
  assign(name_years, data_total[data_total$year == i, ]) 
  
  for (month in month_vec) {
    name_months = paste0(name_years, month)
    a = get(name_years)
    assign(name_months, a[a$month == month, ])
    
    b = get(name_months)
    total_cases = sum(b$dengue_cases, na.rm = T)
    name_column = paste0(i,"_", month)
    placeholder_df = data.frame(data_names = name_column, dengue_cases = total_cases)   
    total_thailand = rbind(total_thailand, placeholder_df)
    
  }
}

total_thailand = cbind(total_thailand, time_column_short)

#max, min, sd

apply(data_total[,c(num_col, int_col)], 2, function(x){max(x,na.rm=TRUE)})
##      temperature        Longitude         Latitude        incidence 
##     3.443087e+01     1.051115e+02     1.984765e+01     3.704772e+02 
##     dengue_cases population_count             year 
##     8.324000e+03     5.716248e+06     2.020000e+03
apply(data_total[,c(num_col, int_col)], 2, function(x){min(x,na.rm=TRUE)})
##      temperature        Longitude         Latitude        incidence 
##     1.766781e+01     9.803039e+01     6.177542e+00     0.000000e+00 
##     dengue_cases population_count             year 
##     0.000000e+00     1.751210e+05     2.006000e+03
apply(data_total[,c(num_col, int_col)], 2, function(x){mean(x,na.rm=TRUE)})
##      temperature        Longitude         Latitude        incidence 
##         27.03971        100.99276         14.17976         11.37365 
##     dengue_cases population_count             year 
##         99.06063     847552.03596       2013.00000
apply(data_total[,c(num_col, int_col)], 2, function(x){median(x,na.rm=TRUE)})
##      temperature        Longitude         Latitude        incidence 
##     2.722116e+01     1.005451e+02     1.483826e+01     6.719759e+00 
##     dengue_cases population_count             year 
##     4.300000e+01     6.612255e+05     2.013000e+03
apply(data_total[,c(num_col, int_col)], 2, function(x){sd(x,na.rm=TRUE)})
##      temperature        Longitude         Latitude        incidence 
##     2.117061e+00     1.717002e+00     3.478485e+00     1.571335e+01 
##     dengue_cases population_count             year 
##     2.035782e+02     7.219595e+05     4.320652e+00

#percentage in the monsoon season

#percentage monsoon cases verses over all cases
monsoon_perc = sum(data_monsoon$incidence, na.rm = T)/sum(data_total$incidence, na.rm = T)
monsoon_perc
## [1] 0.5601323
#average temperature in monsoon vs over all
mean(data_monsoon$temperature, na.rm = T)
## [1] 27.68275
mean(data_total$temperature, na.rm = T)
## [1] 27.03971
data_not_monsoon = data_total[!(data_total$month %in% c("Jun", "Jul", "Aug", "Sep")),]
mean(data_not_monsoon$temperature, na.rm = T)
## [1] 26.71819

#mean of dengue_cases and temperature of whole thailand per month

library(ggplot2)
## Warning: Paket 'ggplot2' wurde unter R Version 4.3.1 erstellt
library(scales)
## Warning: Paket 'scales' wurde unter R Version 4.3.1 erstellt
#dengue cases of whole thailand per month
f_mean_dengue = function(){
  mean_dengue <- data.frame(dengue_cases = numeric(), time_column = as.Date(character()))
  months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
  for (i in months) {
    month = data_total[data_total$time_column == i,]
    m_month = data.frame(dengue_cases = mean(month$dengue_cases, na.rm = TRUE), time_column = month$time_column[1])
    mean_dengue = rbind(mean_dengue, m_month)
  }
  return(mean_dengue)
}


f_mean_temp = function(){
  mean_temp <- data.frame(temperature = numeric(), time_column = as.Date(character()))
  months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
  for (i in months) {
    month = data_total[data_total$time_column == i,]
    m_month = data.frame(temperature = mean(month$temperature, na.rm = TRUE), time_column = month$time_column[1])
    mean_temp = rbind(mean_temp, m_month)
  }
  return(mean_temp)
}

plot1 = ggplot2::ggplot() +
  geom_line(data = f_mean_dengue(), aes(x = time_column, y = dengue_cases)) +
  labs(x = "time", y = "dengue-cases") +
  scale_x_continuous(breaks = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "2 year"))

plot2 = ggplot2::ggplot() +
  geom_line(data = f_mean_temp(), aes(x = time_column, y = temperature), color = "red") +
  labs(x = "time", y = "temperature") +
  scale_x_continuous(breaks = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "2 year"))


gridExtra::grid.arrange(plot1, plot2, nrow = 2)

#mean and median comparison

library(ggplot2)

f_median_dengue = function(){
  median_dengue <- data.frame(dengue_cases = numeric(), time_column = as.Date(character()))
  months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
  for (i in months) {
    month = data_total[data_total$time_column == i,]
    m_month = data.frame(dengue_cases = median(month$dengue_case, na.rm = TRUE), time_column = month$time_column[1])
    median_dengue = rbind(median_dengue, m_month)
  }
  return(median_dengue)
}

f_median_temp = function(){
  median_temp <- data.frame(temperature = numeric(), time_column = as.Date(character()))
  months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
  for (i in months) {
    month = data_total[data_total$time_column == i,]
    m_month = data.frame(temperature = median(month$temperature, na.rm = TRUE), time_column = month$time_column[1])
    median_temp = rbind(median_temp, m_month)
  }
  return(median_temp)
}
#compute difference between mean and median
dif_mm_dengue = f_mean_dengue()["dengue_cases"]-f_median_dengue()["dengue_cases"]
dif_mm_dengue = cbind(dif_mm_dengue,f_mean_dengue()["time_column"])

dif_mm_temp = f_mean_temp()["temperature"]-f_median_temp()["temperature"]
dif_mm_temp = cbind(dif_mm_temp,f_mean_temp()["time_column"])

plot_mean_temp = ggplot2::ggplot() +
  geom_line(data = f_mean_temp(), aes(x = time_column, y = temperature)) +
  labs(x = "time", y = "temperature", title = "Mean temperature")

plot_mean_dengue = ggplot2::ggplot() +
  geom_line(data = f_mean_dengue(), aes(x = time_column, y = dengue_cases)) +
  labs(x = "time", y = "dengue-cases", title = "Mean dengue cases")

plot_median_temp = ggplot2::ggplot() +
  geom_line(data = f_median_temp(), aes(x = time_column, y = temperature)) +
  labs(x = "time", y = "temperature", title = "Median temperature")

plot_median_dengue = ggplot2::ggplot() +
  geom_line(data = f_median_dengue(), aes(x = time_column, y = dengue_cases)) +
  labs(x = "time", y = "dengue-cases", title = "Median dengue cases")

plot_dif_dengue = ggplot2::ggplot() +
  geom_line(data = dif_mm_dengue, aes(x = time_column, y = dengue_cases)) +
  labs(x = "time", y = "dengue-cases", title = "Difference between mean and median")

plot_dif_temp = ggplot2::ggplot() +
  geom_line(data = dif_mm_temp, aes(x = time_column, y = temperature)) +
  labs(x = "time", y = "temperature", title = "Difference between mean and median")

gridExtra::grid.arrange(plot_mean_dengue, plot_mean_temp, plot_median_dengue, plot_median_temp,plot_dif_dengue,plot_dif_temp, nrow = 3)

#mean of dengue_cases and temperature of whole thailand per year

f_mean_dengue.y.t = function(){
  mean_dengue.y.t <- data.frame(dengue_cases = numeric(), time_column = as.Date(character()))
  for (i in c(2006:2020)) {
    month = data_total[data_total$year == i,]
    m_month = data.frame(dengue_cases = mean(month$dengue_cases, na.rm = TRUE), time_column = month$time_column[1])
    mean_dengue.y.t = rbind(mean_dengue.y.t, m_month)
  }
  return(mean_dengue.y.t)
}
mean_dengue.y.t = as.data.frame(f_mean_dengue.y.t())

f_mean_temp.y.t = function(){
  mean_temp.y.t <- data.frame(temperature = numeric(), time_column = as.Date(character()))
  for (i in c(2006:2020)) {
    month = data_total[data_total$year == i,]
    m_month = data.frame(temperature = mean(month$temperature, na.rm = TRUE), time_column = month$time_column[1])
    mean_temp.y.t = rbind(mean_temp.y.t, m_month)
  }
  return(mean_temp.y.t)
}
mean_temp.y.t = as.data.frame(f_mean_temp.y.t())

#mean of incidence and temperature of each provinze per year

f_mean_inc.y.p = function(period, areas){
  mean_inc.y.p <- data.frame(incidence = numeric(), time_column = as.Date(character()), Reporting_areas = character())
  for (i in period){
    year = data_total[data_total$year == i,]
    for (p in areas) {
      provinz = year[year$Reporting_areas == p,]
      m_y_p = data.frame(incidence = mean(provinz$incidence, na.rm = TRUE), time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1])
      mean_inc.y.p = rbind(mean_inc.y.p, m_y_p)
    }
  }
  return(mean_inc.y.p)
}

f_mean_temp.y.p = function(period, areas){
  mean_temp.y.p <- data.frame(temperature = numeric(), time_column = as.Date(character()), Reporting_areas = character())
  for (i in period){
    year = data_total[data_total$year == i,]
    for (p in areas) {
      provinz = year[year$Reporting_areas == p,]
      m_y_p = data.frame(temperature = mean(provinz$incidence, na.rm = TRUE), time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1])
      mean_temp.y.p = rbind(mean_temp.y.p, m_y_p)
    }
  }
  return(mean_temp.y.p)
}

#Plot für Bangkok

dengue_plot_cases = c()
dengue_plot_time = c()

dengue_plot_cases <- data_total$dengue_cases[data_total$Reporting_areas == "Bangkok"]
dengue_plot_time <- data_total$time_column[data_total$Reporting_areas == "Bangkok"]

plot(dengue_plot_time,dengue_plot_cases, type = "o")

#distribution of incidence

q.dengue = quantile(data_total$incidence, probs = c(0.01,0.1,0.25,0.5,0.75,0.9,0.99), na.rm = TRUE)
hist(data_total$incidence,  breaks = 200, main = "Distribution of incidence", xlab = "incidence", freq = F);abline(v=q.dengue,lty=3,lwd=2,col='red')
lines(density(data_total$incidence, na.rm = T), col = "blue", lwd = 2)

q.temp = quantile(data_total$temperature, probs = c(0.01,0.1,0.25,0.5,0.75,0.9,0.99), na.rm = TRUE)
hist(data_total$temperature,  breaks = 200, main = "Distribution of temperature", xlab = "temperature", freq = F);abline(v=q.temp,lty=3,lwd=2,col='red')
curve(dnorm(x, mean = mean(data_total$temperature, na.rm = TRUE), sd = sd(data_total$temperature, na.rm = TRUE)), 
      add = TRUE, col = "green", lwd = 2)
lines(density(data_total$temperature, na.rm = T), col = "blue", lwd = 2)

#correlation between temperature and incidence

library(npreg)
## Warning: Paket 'npreg' wurde unter R Version 4.3.1 erstellt
library(ggplot2)

smoothed_curve <- ss(clean_data_total$temperature, clean_data_total$incidence, nknots = 10)

plot(clean_data_total$temperature, clean_data_total$incidence, pch = 20,
     ylab = "incidence",
     xlab = "temperature",
  abline(v = mean(clean_data_total$temperature), col = "red"))

plot(smoothed_curve, col = "blue")

plot(rank(data_total$temperature), rank(data_total$incidence), pch = 20,
     ylab = "incidence",
     xlab = "temperature",
  abline(v = mean(data_total$temperature), col = "red"))

qqplot(data_total$temperature, data_total$incidence)

#pearson correlation y.t.
cor(mean_dengue.y.t$dengue_cases, mean_temp.y.t$temperature)
## [1] 0.3958178
#spearman correlation y.t.
cor(mean_dengue.y.t$dengue_cases, mean_temp.y.t$temperature, method = "spearman")
## [1] 0.2913317
#pearson correlation m.p.
cor(clean_data_total$dengue_cases, clean_data_total$temperature)
## [1] 0.1225488
#spearman correlation m.p.
cor(clean_data_total$dengue_cases, clean_data_total$temperature, method = "spearman")
## [1] 0.2593817
pheatmap::pheatmap(cor(clean_data_total[c("dengue_cases", "population_count", "temperature", "incidence")]))

#monsoon season smoothed curve incidence temperature

library(npreg)
library(ggplot2)

smoothed_curve_m <- ss(clean_data_monsoon$temperature, clean_data_monsoon$incidence, nknots = 10)
plot(clean_data_monsoon$temperature, clean_data_monsoon$incidence, pch = 20,
     ylab = "incidence",
     xlab = "temperature",
  abline(v = mean(clean_data_monsoon$temperature), col = "red"))

plot(smoothed_curve, col = "blue")

#only for quantile 5% to 95%
lower_limit = quantile(clean_data_monsoon$temperature, probs = 0.05, na.rm = T)
upper_limit = quantile(clean_data_monsoon$temperature, probs = 0.95, na.rm = T)

clean_data_monsoon_5.95 = subset(clean_data_monsoon, temperature >= lower_limit & temperature <= upper_limit)

smoothed_curve_m_5.95 <- ss(clean_data_monsoon_5.95$temperature, clean_data_monsoon_5.95$incidence, nknots = 10)
plot(clean_data_monsoon_5.95$temperature, clean_data_monsoon_5.95$incidence, pch = 20,
     ylab = "incidence",
     xlab = "temperature",
  abline(v = mean(clean_data_monsoon_5.95$temperature), col = "red"))

plot(smoothed_curve_m_5.95, col = "blue")

#correlation between different areas

#mean of temp and incidence over all years per area
t_data_total = t(data_total)
r_areas = unique(data_total$Reporting_areas)
m_inc_temp.y.r = data.frame(Reporting_areas = character(), incidence = numeric(), temperature = numeric())
for (i in r_areas){
  temp_inc = apply(data_total[which(clean_data_total$Reporting_areas == i),c("incidence", "temperature")], 2,function(g){mean(g, na.rm = TRUE)})
  temp_inc_df = data.frame(Reporting_areas = i, incidence = temp_inc["incidence"], temperature = temp_inc["temperature"])
  m_inc_temp.y.r = rbind(m_inc_temp.y.r, temp_inc_df)
  rownames(m_inc_temp.y.r) <- 1:nrow(m_inc_temp.y.r)
}

#k-means clustering

#elbow method
km = kmeans(m_inc_temp.y.r["incidence"], centers = 2, nstart = 10)
km$tot.withinss
## [1] 237.1706
wss = sapply(2:7,function(k) { 
  kmeans(m_inc_temp.y.r["incidence"], centers = k)$tot.withinss
})
plot(2:7,wss,type='b',pch=19,xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

#silhoutte method
library(cluster)
## Warning: Paket 'cluster' wurde unter R Version 4.3.1 erstellt
#library (NbClust)
#library (clustertend)
library (factoextra)
## Warning: Paket 'factoextra' wurde unter R Version 4.3.1 erstellt
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#library (fpc)
#library (clValid)
library(ggplot2)

#find optimal number of clusters by incidence
D_inc = dist(m_inc_temp.y.r["incidence"])
km_inc = kmeans(m_inc_temp.y.r["incidence"], centers = 4, nstart = 10)
s_inc = silhouette(km_inc$cluster,D_inc)
plot(s_inc)

fviz_nbclust(m_inc_temp.y.r["incidence"], pam, method = "silhouette")+ theme_classic()

#find optimal number of clusters by temperature
D_temp = dist(m_inc_temp.y.r["temperature"])
km_temp = kmeans(m_inc_temp.y.r["temperature"], centers = 4, nstart = 10)
s_temp = silhouette(km_temp$cluster,D_temp)
plot(s_temp)

fviz_nbclust(m_inc_temp.y.r["temperature"], pam, method = "silhouette")+ theme_classic()

#find optimal number of clusters by temperature and incidence
D_inc_temp = dist(scale(m_inc_temp.y.r[,c("incidence","temperature")]))
km_inc_temp = kmeans(scale(m_inc_temp.y.r[,c("incidence","temperature")]), centers = 3)
s_inc_temp = silhouette(km_inc_temp$cluster,D_inc_temp)
plot(s_inc_temp)

fviz_nbclust(scale(m_inc_temp.y.r[,c("incidence","temperature")]), pam, method = "silhouette")+ theme_classic()

m_inc_temp_cluster.y.r = data.frame(m_inc_temp.y.r, km_inc_temp$cluster)

plot1 = ggplot(m_inc_temp.y.r, aes(x = temperature, y = incidence, color = as.factor(km_inc$cluster))) +
  geom_point() +
  labs(title = "Partitioning Clustering Plot by incidence") +
  theme_classic()

plot2 = ggplot(m_inc_temp.y.r, aes(x = temperature, y = incidence, color = as.factor(km_temp$cluster))) +
  geom_point() +
  labs(title = "Partitioning Clustering Plot by temperature") +
  theme_classic()
plot2

plot3 = ggplot(m_inc_temp.y.r, aes(x = temperature, y = incidence, color = as.factor(km_inc_temp$cluster))) +
  geom_point() +
  labs(title = "Partitioning Clustering Plot") +
  theme_classic()
plot3

gridExtra::grid.arrange(plot1, plot2, plot3, ncol = 1)

#probieren mit time series Objekten

my_ts <- ts(data_total$dengue_cases, start = c(data_total$time_column[1]), frequency = 12)
head(my_ts)
## [1] NA NA  1 12 29 44
TSstudio::ts_plot(my_ts)

#GAM

library(readxl)
library(nlme)
## Warning: Paket 'nlme' wurde unter R Version 4.3.1 erstellt
## 
## Attache Paket: 'nlme'
## Das folgende Objekt ist maskiert 'package:dplyr':
## 
##     collapse
library(gam)
## Warning: Paket 'gam' wurde unter R Version 4.3.1 erstellt
## Lade nötiges Paket: splines
## Lade nötiges Paket: foreach
## Warning: Paket 'foreach' wurde unter R Version 4.3.1 erstellt
## Loaded gam 1.22-2
library(sdm)
## Warning: Paket 'sdm' wurde unter R Version 4.3.1 erstellt
## Lade nötiges Paket: sp
## Warning: Paket 'sp' wurde unter R Version 4.3.1 erstellt
## The legacy packages maptools, rgdal, and rgeos, underpinning this package
## will retire shortly. Please refer to R-spatial evolution reports on
## https://r-spatial.org/r/2023/05/15/evolution4.html for details.
## This package is now running under evolution status 0
## sdm 1.1-8 (2021-11-11)
library(raster)
## Warning: Paket 'raster' wurde unter R Version 4.3.1 erstellt
## 
## Attache Paket: 'raster'
## Das folgende Objekt ist maskiert 'package:nlme':
## 
##     getData
## Das folgende Objekt ist maskiert 'package:dplyr':
## 
##     select
library(ncdf4)
library(mgcv)
## Warning: Paket 'mgcv' wurde unter R Version 4.3.1 erstellt
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
## 
## Attache Paket: 'mgcv'
## Die folgenden Objekte sind maskiert von 'package:gam':
## 
##     gam, gam.control, gam.fit, s
attach(data_total)
## Die folgenden Objekte sind maskiert durch .GlobalEnv:
## 
##     data_names, month, time_column, year
model1 = gam(incidence~s(temperature, k = 16), family = "quasipoisson", na.rm = T)
plot.gam(model1)

summary(model1) #edf: → a value close to 1 tends to be close to a linear term → a higher value means that the spline is more wiggly (non-linear). 
## 
## Family: quasipoisson 
## Link function: log 
## 
## Formula:
## incidence ~ s(temperature, k = 16)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.36386    0.01243   190.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df     F p-value    
## s(temperature) 13.82  14.76 42.97  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0465   Deviance explained = 9.31%
## GCV = 11.771  Scale est. = 19.981    n = 13607
#F-Wert und der p-Wert geben die statistische Signifikanz der glatten Funktion an. In diesem Fall ist die glatte Funktion von "temperature" hoch signifikant (p-Wert < 2e-16), was darauf hindeutet, dass sie einen signifikanten Einfluss auf die Vorhersage der "incidence" hat.Der angegebene R-squared-Wert (R-sq.(adj)) von 0.0407 gibt an, wie gut das Modell die Varianz der abhängigen Variable erklärt. In diesem Fall erklärt das Modell etwa 4,07% der Varianz in den Daten. Ein niedriger GCV-Wert deutet auf eine gute Anpassung des Modells hin.
k.check(model1)
##                k'      edf   k-index p-value
## s(temperature) 15 13.81749 0.9896895  0.8025
gam.check(model1)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 4 iterations.
## Gradient range [2.609013e-05,2.609013e-05]
## (score 11.77115 & scale 19.98141).
## Hessian positive definite, eigenvalue range [0.001424269,0.001424269].
## Model rank =  16 / 16 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                  k'  edf k-index p-value
## s(temperature) 15.0 13.8    0.98    0.66
best_aic = Inf
best_model = NULL
k_best = 0
for (i in 3:50){
      # Calculate AIC
      smooth_model = gam(incidence~s(temperature, k = i))
      aic = AIC(smooth_model)
      # Check if the current model has lower AIC than the best model so far
      if (aic < best_aic) {
        best_aic = aic
        best_model = smooth_model
        k_best = i
      }
}
best_aic
## [1] 112947.1
k_best
## [1] 16
smooth_model = gam(incidence~s(temperature, k = 16))
linear_model = gam(incidence~temperature, k = 16)
AIC(smooth_model, linear_model)
##                    df      AIC
## smooth_model 14.88464 112947.1
## linear_model  3.00000 113361.2
detach(data_total)

GAM forecast

library(raster)
library(gam)
library(sdm)
library(readxl)
library(mgcv)
library(ncdf4)

## lower_limit = quantile(data_monsoon$temperature, probs = 0.05, na.rm = T)
#upper_limit = quantile(data_monsoon$temperature, probs = 0.95, na.rm = T)

#data_monsoon_5.95 = subset(data_monsoon, temperature >= lower_limit & temperature <= upper_limit)

attach(data_total)
## Die folgenden Objekte sind maskiert durch .GlobalEnv:
## 
##     data_names, month, time_column, year
model2 <- gam(incidence~s(temperature, k = 16), family = "quasipoisson")
tas <- stack("tas_SEA22_MPI_rcp85_2021-2040_grid_subc_daymean_monmean_timmean.nc")
#tas_test = crop(tas, my.area)
names(tas) <- "temperature"
p1 <- raster::predict(tas, model2, type = "response")

thailand <- raster::getData('GADM', country='THA', level=1)
## Warning in raster::getData("GADM", country = "THA", level = 1): getData will be removed in a future version of raster
## . Please use the geodata package instead
my.area <- extent(thailand)
my.area = extent(95,107,5.5,20.5)
my.p1 <- crop(p1, my.area)
my.p1.mask <- mask(my.p1, thailand)


plot(my.p1.mask, legend.width=2, legend.shrink=0.5, axes= F, box=F, zlim = c(0,22.8)) + plot(thailand, add=TRUE)

## integer(0)
detach(data_total)

#Plotting der Inzidenz eines einzelnen Monats auf die Karte von Thailand

library(sf)
## Warning: Paket 'sf' wurde unter R Version 4.3.1 erstellt
## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(ggplot2)


district_sf <- st_read("gadm36_THA_shp/gadm36_THA_1.shp")
## Reading layer `gadm36_THA_1' from data source 
##   `C:\Users\thadl\OneDrive\Dokumente\GitHub\topic04_team04\gadm36_THA_shp\gadm36_THA_1.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 97.34519 ymin: 5.616042 xmax: 105.6391 ymax: 20.46321
## Geodetic CRS:  WGS 84
#Inzidenz vom April 2006 auf die karte geplottet

#district_sf$incidence2006Aug <- rows_2006Aug$incidence

#ggplot() +
 # geom_sf(data = district_sf, aes(fill = incidence2006Aug)) +
 # scale_fill_gradient(low = "yellow", high = "red") + labs(fill = "Incidence in Aug 2006")

#Funktion für Inzidenz plot der Monate

library(sf)
library(ggplot2)


district_sf <- st_read("gadm36_THA_shp/gadm36_THA_1.shp")

map_incidence_monthly = function(incidence_month){
  
 max_incidence = which.max(incidence_month$incidence)
max_incidence_area = incidence_month$Reporting_areas[max_incidence]



ggplot() +
  geom_sf(data = district_sf, aes(fill = incidence_month$incidence)) +
  scale_fill_gradient(low = "white", high = "green") +
  labs(fill = paste("Highest incidence =", max_incidence, "in", max_incidence_area))
  
  
}
#input for incidence month in format: rows_YYYY_Mon

map_temp_monthly = function(temp_month){
  
max_temp = which.max(temp_month$temperature)
max_temp_area = temp_month$Reporting_areas[max_temp]



ggplot() +
  geom_sf(data = district_sf, aes(fill = temp_month$temperature)) +
  scale_fill_gradient(low = "white", high = "green") +
  labs(fill = paste("Highest temperature =", max_temp, "in", max_temp_area))
  
  
}
##map_incidence_monthly(f_mean_inc.y.p(period = c(2006:2020), areas = r_areas))

##gridExtra::grid.arrange(map_temp_monthly(f_mean_temp.y.p(period = c(2020), areas = r_areas)),map_incidence_monthly(f_mean_inc.y.p(period = c(2020), areas = r_areas)))

#average incidence 2006 to 2020 map

library(ggplot2)
library(sf)

f_map_mean_inc.y.p = function(period, areas){
  map_mean_inc.y.p <- data.frame(incidence = numeric(), time_column = as.Date(character()), Reporting_areas = character())
  for (i in period){
    year = map_data_total[map_data_total$year == i,]
    for (p in areas) {
      provinz = year[year$Reporting_areas == p,]
      m_y_p = data.frame(incidence = mean(provinz$incidence, na.rm = TRUE), time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1])
      map_mean_inc.y.p = rbind(map_mean_inc.y.p, m_y_p)
    }
  }
  return(map_mean_inc.y.p)
}

f_map_mean_inc.p = function(areas){
  map_mean_inc.p <- data.frame(time_column = as.Date(character()), Reporting_areas = character(), incidence = numeric())
  for (p in areas) {
      provinz = map_data_total[map_data_total$Reporting_areas == p,]
      m_p = data.frame(time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1],incidence = mean(provinz$incidence, na.rm = TRUE))
      map_mean_inc.p = rbind(map_mean_inc.p, m_p)
  }
  return(map_mean_inc.p)
}


r_areas2 = unique(data_total$Reporting_areas)
mean_inc.p <- data.frame(time_column = as.Date(character()), Reporting_areas = character(), incidence = numeric())
  for (p in r_areas2) {
      provinz = data_total[data_total$Reporting_areas == p,]
      m_p = data.frame(time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1],incidence = mean(provinz$incidence, na.rm = TRUE))
      mean_inc.p = rbind(mean_inc.p, m_p)
}

map_mean_inc.p = f_map_mean_inc.p(areas = unique(map_data_total$Reporting_areas))
district_sf = st_read("gadm36_THA_shp/gadm36_THA_1.shp")
## Reading layer `gadm36_THA_1' from data source 
##   `C:\Users\thadl\OneDrive\Dokumente\GitHub\topic04_team04\gadm36_THA_shp\gadm36_THA_1.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 97.34519 ymin: 5.616042 xmax: 105.6391 ymax: 20.46321
## Geodetic CRS:  WGS 84
district_sf$incidence <- map_mean_inc.p$incidence


ggplot2::ggplot() +
  geom_sf(data = district_sf, aes(fill = incidence)) +
  scale_fill_gradientn(colors = rev(terrain.colors(100)), limits = c(0,22.8)) + labs(fill = "Incidence") + labs(title = "Average incidences for 2006-2020 ")